import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import missingno as msno # 观测缺失数据特别好的模块
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics   #accuracy measure 计算预测精度
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn.model_selection import KFold #for K-fold cross validation，交叉验证的轮数
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
from sklearn.model_selection import GridSearchCV    # 使用GridSearchCV挑选一组参数中最好的参数
from sklearn.ensemble import VotingClassifier
'''泰坦尼克号获救预测'''
'''
1、当不清楚使用哪种分类模型时，可以使用 cross_val_score 来分别计算不同模型的评分，选择评分高的模型
2、当不清楚模型应该选择的最佳参数时，可以使用GridSearchCV 来遍历参数列表，选择得分最高的一组参数
3、metrics.accuracy_score 可以利用预测值和真实值，计算预测精确度
4、feature_importances_属性 可以查看特征的重要性
'''
def run():
    """Entry point: load the Titanic training set and run the analysis pipeline.

    Steps: load CSV -> exploratory analysis (several steps commented out after
    initial review) -> fill missing values -> feature engineering ->
    model comparison -> feature importance plot.
    """
    # NOTE(review): hard-coded Windows path with unescaped backslashes; it works
    # only because none of them form valid escape sequences — a raw string
    # (r'E:\...') would be safer. Confirm the path on the target machine.
    data = pd.read_csv('E:\培训教程\python\数据挖掘案例\泰坦尼克号\\train.csv')
    # Show up to 500 columns when printing DataFrames so nothing is elided.
    pd.set_option('display.max_columns', 500)
    '''取消告警记录的提示'''
    warnings.filterwarnings('ignore')  # suppress library warnings for cleaner console output
    #data_visibile(data)
    sex_analysis(data)
    #p_class_analysis(data)
    #p_class_sex_analysis(data)
    data = isnull_data(data)          # impute missing Age values via name titles
    data = Embarked_analysis(data)    # fill missing Embarked with the mode
    #sibsip_analysis(data)
    #date_cormatrix(data)
    data = Feature_enginerring(data)  # discretize/encode features, drop unusable columns
    data = ml_model(data)             # fit and cross-validate several classifiers
    #parm_modify(data)
    Feature_importance(data)          # random-forest feature importance plot

'''数据可视化分析'''
def data_visibile(data):
    """Print summary statistics and visualise missingness and the survival split."""
    # High-level numeric summary of every column.
    print(data.describe())

    # Per-column count of missing values.
    print(data.isnull().sum())

    # Visual matrix showing where values are missing.
    msno.matrix(data)
    plt.show()

    # Survival rate (pie chart) and survivor counts (bar chart), side by side.
    fig, axes = plt.subplots(1, 2, figsize=(18, 8))
    survived_counts = data['Survived'].value_counts()
    survived_counts.plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=axes[0], shadow=True)
    axes[0].set_title('Survived')
    axes[0].set_ylabel('')
    # Bar chart of observation counts per class:
    # http://seaborn.pydata.org/generated/seaborn.countplot.html?highlight=countplot#seaborn.countplot
    sns.countplot('Survived', data=data, ax=axes[1])
    axes[1].set_title('Survived')
    plt.show()

'''性别特征分析'''
def sex_analysis(data):
    """Examine how sex relates to survival: grouped counts plus two plots."""
    # Count survivors / non-survivors within each sex.
    survival_by_sex = data.groupby(['Sex', 'Survived'])['Survived'].count()
    print(survival_by_sex)

    fig, axes = plt.subplots(1, 2, figsize=(18, 8))
    # Mean of the 0/1 Survived column per sex == survival rate per sex.
    data[['Sex', 'Survived']].groupby(['Sex']).mean().plot.bar(ax=axes[0])
    axes[0].set_title('Survived vs Sex')
    # Raw counts split by survival: more men aboard, yet more women survived.
    sns.countplot('Sex', hue='Survived', data=data, ax=axes[1])
    axes[1].set_title('Sex:Survived vs Dead')
    plt.show()

'''船舱等级对获救率的影响'''
def p_class_analysis(data):
    """Examine how cabin class (Pclass) relates to survival."""
    # Contingency table: rows indexed by Pclass, columns by Survived, with totals.
    print(pd.crosstab(data.Pclass, data.Survived, margins=True))

    fig, axes = plt.subplots(1, 2, figsize=(18, 8))
    # Passenger counts per class.
    data['Pclass'].value_counts().plot.bar(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=axes[0])
    axes[0].set_title('Number Of Passengers By Pclass')
    axes[0].set_ylabel('Count')
    # Survival split within each class.
    sns.countplot('Pclass', hue='Survived', data=data, ax=axes[1])
    axes[1].set_title('Pclass:Survived vs Dead')
    plt.show()

'''船舱等级和性别对获救率的影响'''
'''使用点图描述 factorplot'''
def p_class_sex_analysis(data):
    """Joint effect of cabin class and sex on survival, shown as a point plot."""
    table = pd.crosstab([data.Sex, data.Survived], data.Pclass, margins=True)
    print(table)
    # Point plot (factorplot) of survival rate per class, one line per sex.
    sns.factorplot('Pclass', 'Survived', hue='Sex', data=data)
    plt.show()

'''对数据中缺失值的填充'''
def isnull_data(data):
    """Fill missing Age values using a title extracted from each passenger's Name.

    Adds an 'Initial' column holding the honorific (Mr/Mrs/Miss/Master/Other)
    and imputes missing ages with the (precomputed) mean age of each title group.

    Returns the mutated DataFrame.
    """
    # Extract the honorific preceding a dot, e.g. "Braund, Mr. Owen" -> "Mr".
    # expand=False yields a Series; the original looped over every column and
    # redundantly repeated this exact assignment once per column.
    data['Initial'] = data.Name.str.extract(r'([A-Za-z]+)\.', expand=False)

    # Collapse rare titles into the five canonical groups. Assigning back
    # (rather than Series.replace(inplace=True) on a column selection) keeps
    # this correct under pandas copy-on-write.
    data['Initial'] = data['Initial'].replace(
        ['Mlle', 'Mme', 'Ms', 'Dr', 'Major', 'Lady', 'Countess', 'Jonkheer', 'Col', 'Rev', 'Capt', 'Sir', 'Don'],
        ['Miss', 'Miss', 'Miss', 'Mr', 'Mr', 'Mrs', 'Mrs', 'Other', 'Other', 'Other', 'Mr', 'Mr', 'Mr'])

    # Impute missing ages with each title group's mean age (rounded, taken
    # from the training data); loc selects the rows and the Age column.
    mean_age_by_title = {'Mr': 33, 'Mrs': 36, 'Master': 5, 'Miss': 22, 'Other': 46}
    for title, mean_age in mean_age_by_title.items():
        data.loc[(data.Age.isnull()) & (data.Initial == title), 'Age'] = mean_age

    return data

def Embarked_analysis(data):
    """Analyse the port of embarkation: fill missing values and plot breakdowns.

    Returns the DataFrame with missing Embarked values replaced by 'S'.
    """
    # Cross-tabulate (port, class) against (sex, survival).
    table = pd.crosstab([data.Embarked, data.Pclass], [data.Sex, data.Survived], margins=True)
    print(table)
    # Embarked also has missing values; fill with the mode — 'S' is by far
    # the most common boarding port.
    data['Embarked'].fillna('S', inplace=True)

    # Survival rate per port as a point plot.
    sns.factorplot('Embarked', 'Survived', data=data)
    fig = plt.gcf()
    fig.set_size_inches(5, 3)
    #plt.show()

    # Port C has the best survival odds (~0.55); S has the worst.
    # http://seaborn.pydata.org/generated/seaborn.countplot.html?highlight=countplot#seaborn.countplot
    grid_fig, axes = plt.subplots(2, 2, figsize=(20, 15))
    sns.countplot('Embarked', data=data, ax=axes[0, 0])
    axes[0, 0].set_title('No. Of Passengers Boarded')
    sns.countplot('Embarked', hue='Sex', data=data, ax=axes[0, 1])
    axes[0, 1].set_title('Male-Female Split for Embarked')
    sns.countplot('Embarked', hue='Survived', data=data, ax=axes[1, 0])
    axes[1, 0].set_title('Embarked vs Survived')
    sns.countplot('Embarked', hue='Pclass', data=data, ax=axes[1, 1])
    axes[1, 1].set_title('Embarked vs Pclass')
    plt.subplots_adjust(wspace=0.2, hspace=0.5)
    #plt.show()

    return data

def sibsip_analysis(data):
    """How the number of siblings/spouses aboard (SibSp) affects survival."""
    # SibSp vs survival contingency table with totals.
    print(pd.crosstab([data.SibSp], data.Survived, margins=True))
    # http://seaborn.pydata.org/generated/seaborn.barplot.html?highlight=barplot#seaborn.barplot
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))
    sns.barplot('SibSp', 'Survived', data=data, ax=axes[0])
    axes[0].set_title('SibSp vs Survived')
    sns.factorplot('SibSp', 'Survived', data=data, ax=axes[1])
    axes[1].set_title('SibSp vs Survived')
    plt.show()

    # SibSp against cabin class.
    print(pd.crosstab(data.SibSp, data.Pclass))

'''特征之间的相关性'''
def date_cormatrix(data):
    """Plot an annotated heat map of the pairwise feature correlations."""
    # DataFrame.corr builds the feature-by-feature correlation matrix.
    correlations = data.corr()
    sns.heatmap(correlations, annot=True, cmap='RdYlGn', linewidths=0.2)  # data.corr()-->correlation matrix
    plt.show()

'''特征工程'''
def Feature_enginerring(data):
    """Build model-ready features and drop columns the models cannot use.

    Adds Age_band, Family_Size, Alone, Fare_Range, Fare_cat; encodes
    Sex/Embarked/Initial as integers; then drops Name, Age, Ticket, Fare,
    Cabin, Fare_Range and PassengerId. Returns the transformed DataFrame.
    """
    # Discretize Age into five bands of 16 years each (band 0 covers <= 16,
    # which the initial fill already provides).
    data['Age_band'] = 0
    for band, (low, high) in enumerate([(16, 32), (32, 48), (48, 64)], start=1):
        data.loc[(data['Age'] > low) & (data['Age'] <= high), 'Age_band'] = band
    data.loc[data['Age'] > 64, 'Age_band'] = 4
    print(data.head())

    # Head count per age band.
    print(data['Age_band'].value_counts().to_frame())

    # Survival clearly drops as the age band increases.
    sns.factorplot('Age_band', 'Survived', data=data, col='Pclass')
    #plt.show()

    # Family_Size = siblings/spouses + parents/children; Alone flags passengers
    # travelling by themselves. Survival is low when alone and also drops for
    # families larger than 4, so both look like useful model features.
    data['Family_Size'] = 0
    data['Family_Size'] = data['Parch'] + data['SibSp']  # family size
    data['Alone'] = 0
    data.loc[data.Family_Size == 0, 'Alone'] = 1  # Alone

    fig, axes = plt.subplots(1, 2, figsize=(18, 6))
    sns.factorplot('Family_Size', 'Survived', data=data, ax=axes[0])
    axes[0].set_title('Family_Size vs Survived')
    sns.factorplot('Alone', 'Survived', data=data, ax=axes[1])
    axes[1].set_title('Alone vs Survived')
    #plt.show()

    # Fare is continuous, so bucket it into quartiles via pandas.qcut.
    # Survival rises with the fare bucket, and the effect interacts with sex.
    data['Fare_Range'] = pd.qcut(data['Fare'], 4)
    # Survival rate clearly increases with the fare range.
    print(data.groupby(['Fare_Range'])['Survived'].mean().to_frame())

    # Integer fare category matching the quartile boundaries above
    # (category 0 covers Fare <= 7.91 via the initial fill).
    data['Fare_cat'] = 0
    for cat, (low, high) in enumerate([(7.91, 14.454), (14.454, 31), (31, 513)], start=1):
        data.loc[(data['Fare'] > low) & (data['Fare'] <= high), 'Fare_cat'] = cat

    sns.factorplot('Fare_cat', 'Survived', data=data, hue='Sex')
    #plt.show()

    # Encode string features as integers — the models require numeric input.
    data['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
    data['Embarked'].replace(['S', 'C', 'Q'], [0, 1, 2], inplace=True)
    data['Initial'].replace(['Mr', 'Mrs', 'Miss', 'Master', 'Other'], [0, 1, 2, 3, 4], inplace=True)

    # Drop features the models cannot use:
    # Name/Ticket/Cabin/PassengerId are arbitrary identifiers, and
    # Age/Fare/Fare_Range are superseded by Age_band/Fare_cat.
    data.drop(['Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Fare_Range', 'PassengerId'], axis=1, inplace=True)

    # Heat map of feature correlations on the final feature set.
    date_cormatrix(data)

    return data

def ml_model(data):
    """Fit several classifiers, compare hold-out and cross-validated accuracy,
    and plot cross-validated confusion matrices.

    Assumes the first column of *data* is the target ('Survived') and every
    remaining column is a numeric feature. Returns *data* unchanged.
    """
    # 70/30 split, stratified on the target so both sets keep the class balance.
    train, test = train_test_split(data, test_size=0.3, random_state=0, stratify=data['Survived'])
    train_X = train[train.columns[1:]]
    train_Y = train[train.columns[:1]]
    test_X = test[test.columns[1:]]
    test_Y = test[test.columns[:1]]
    X = data[data.columns[1:]]
    Y = data['Survived']

    # --- Single hold-out comparison ----------------------------------------
    # metrics.accuracy_score(prediction, truth) scores each fitted model;
    # the repeated fit/predict/print blocks are driven from one table.
    holdout_models = [
        ('Accuracy for rbf SVM is ', svm.SVC(kernel='rbf', C=1, gamma=0.1)),
        ('Accuracy for linear SVM is', svm.SVC(kernel='linear', C=0.1, gamma=0.1)),
        ('The accuracy of the Logistic Regression is', LogisticRegression()),
        ('The accuracy of the Decision Tree is', DecisionTreeClassifier()),
        ('The accuracy of the KNN is', KNeighborsClassifier()),
        ('The accuracy of the NaiveBayes is', GaussianNB()),
        ('The accuracy of the Random Forests is', RandomForestClassifier(n_estimators=100)),
    ]
    for message, model in holdout_models:
        model.fit(train_X, train_Y)
        prediction = model.predict(test_X)
        print(message, metrics.accuracy_score(prediction, test_Y))

    # --- 10-fold cross-validation ------------------------------------------
    # cross_val_score evaluates each candidate model so the best one can be
    # chosen. BUG FIX: KFold(random_state=...) without shuffle=True raises
    # ValueError on scikit-learn >= 0.24 because random_state only has an
    # effect when shuffling is enabled.
    kfold = KFold(n_splits=10, shuffle=True, random_state=22)  # k=10, split the data into 10 equal parts
    classifiers = ['Linear Svm', 'Radial Svm', 'Logistic Regression', 'KNN', 'Decision Tree', 'Naive Bayes','Random Forest']
    models = [svm.SVC(kernel='linear'), svm.SVC(kernel='rbf'), LogisticRegression(),
              KNeighborsClassifier(n_neighbors=9), DecisionTreeClassifier(), GaussianNB(),
              RandomForestClassifier(n_estimators=100)]
    mean_scores = []
    std_scores = []
    all_scores = []
    for model in models:
        cv_result = cross_val_score(model, X, Y, cv=kfold, scoring="accuracy")
        mean_scores.append(cv_result.mean())
        std_scores.append(cv_result.std())
        all_scores.append(cv_result)
    new_models_dataframe2 = pd.DataFrame({'CV Mean': mean_scores, 'Std': std_scores}, index=classifiers)
    print(new_models_dataframe2)

    new_models_dataframe2['CV Mean'].plot.barh(width=0.8)
    plt.title('Average CV Mean Accuracy')
    fig = plt.gcf()
    fig.set_size_inches(8, 5)
    #plt.show()

    # --- Confusion matrices (10-fold cross_val_predict) ---------------------
    # The diagonal counts correct predictions; off-diagonal cells are errors.
    # e.g. for the first matrix, accuracy ~= (491 + 247) / 891 = 82.8%.
    matrix_models = [
        ('Matrix for rbf-SVM', svm.SVC(kernel='rbf')),
        ('Matrix for Linear-SVM', svm.SVC(kernel='linear')),
        ('Matrix for KNN', KNeighborsClassifier(n_neighbors=9)),
        ('Matrix for Random-Forests', RandomForestClassifier(n_estimators=100)),
        ('Matrix for Logistic Regression', LogisticRegression()),
        ('Matrix for Decision Tree', DecisionTreeClassifier()),
        ('Matrix for Naive Bayes', GaussianNB()),
    ]
    f, ax = plt.subplots(3, 3, figsize=(12, 10))
    # ax.flat walks the 3x3 grid row-major, matching the original placement.
    for (title, estimator), axis in zip(matrix_models, ax.flat):
        y_pred = cross_val_predict(estimator, X, Y, cv=10)
        sns.heatmap(confusion_matrix(Y, y_pred), ax=axis, annot=True, fmt='2.0f')
        axis.set_title(title)
    plt.subplots_adjust(hspace=0.2, wspace=0.2)
    #plt.show()
    return data


'''调整参数，查看对算法分类效果的影响，使用GridSearchCV穷尽列出的参数组合'''
# GridSearchCV的使用 https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
'''hyper 中字典类型的键名，必须是算法中实际存在的参数名称'''
def parm_modify(data):
    """Grid-search hyper-parameters for SVC and RandomForest on the full data.

    GridSearchCV exhaustively tries each combination in the grid (the dict
    keys must be real parameter names of the estimator) and the best score
    and estimator found are printed for each model family.
    https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
    """
    # First column is the target ('Survived'); the rest are features.
    # The unused train/test split from the original was removed:
    # GridSearchCV performs its own internal cross-validation on X, Y.
    X = data[data.columns[1:]]
    Y = data['Survived']

    # Search the best parameter combination for the support vector machine.
    C = [0.05, 0.1, 0.2, 0.3, 0.25, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    gamma = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    kernel = ['rbf', 'linear']
    hyper = {'kernel': kernel, 'C': C, 'gamma': gamma}
    gd = GridSearchCV(estimator=svm.SVC(), param_grid=hyper, verbose=True)
    gd.fit(X, Y)
    print('The SVC best best_score_ is :', gd.best_score_)
    print('The SVC best best_estimator_ is :', gd.best_estimator_)

    # Search the best forest size for the random forest.
    n_estimators = range(100, 1000, 50)
    hyper = {'n_estimators': n_estimators}
    gd = GridSearchCV(estimator=RandomForestClassifier(random_state=0), param_grid=hyper, verbose=True)
    gd.fit(X, Y)
    print('The RandomForest best best_score_ is :', gd.best_score_)
    print('The RandomForest best best_estimator_ is :', gd.best_estimator_)


'''调用随机森林算法的feature_importances_ 属性，即可得到特征重要性的array'''
def Feature_importance(data):
    """Fit a random forest on the full data and plot its feature importances.

    Reads the fitted model's feature_importances_ array; the first column of
    *data* is treated as the target and the rest as features.
    """
    # The original computed an unused train/test split here; the model is
    # fitted on all of X, Y, so the split was removed.
    X = data[data.columns[1:]]
    Y = data['Survived']
    f, ax = plt.subplots(figsize=(15, 12))
    model = RandomForestClassifier(n_estimators=500, random_state=0)
    model.fit(X, Y)
    # Plot explicitly on the axes we created instead of relying on the
    # implicit "current axes" that the original depended on.
    importances = pd.Series(model.feature_importances_, X.columns)
    importances.sort_values(ascending=True).plot.barh(width=0.8, ax=ax)
    ax.set_title('Feature Importance in Random Forests')

    plt.show()

# Script entry point: run the whole analysis pipeline when executed directly.
if __name__=='__main__':
    run()